## The Role of Case Management in Misdemeanor Prosecution
## Lindsay Graef and Aurelie Ouss
##
## RUN BENCHMARKING 
## This script runs all benchmarking, for main analysis and robustness samples
## Then collects summaries of all benchmarking results and saves to .csv files
## This script can be run as a background job

library(raster)
library(gbm3)
library(fastDR)
library(survey)
library(daotools)
library(tidyr)
library(tidytable) 
library(tidyverse)
library(conflicted)

# Set conflict resolution for functions
conflicts_prefer(tidytable::filter(),
                 tidytable::arrange(),
                 tidytable::mutate(),
                 tidytable::rename(),
                 tidytable::left_join(),
                 tidytable::distinct(),
                 tidytable::summarize(),
                 tidytable::select(),
                 tidytable::case_when(),
                 tidytable::ifelse(),
                 tidytable::pivot_longer(),
                 tidytable::pivot_wider(),
                 tidytable::bind_rows(),
                 tidytable::`%in%`
)

# set seed for replication
set.seed(52285)

#-------------------------------------------------------------------------------#
# Functions---------------------------------------------------------------------#
#-------------------------------------------------------------------------------#

## Get ADA IDs
# creates random but consecutive numeric, anonymized ID numbers by ADA name
getADAIDs <- function(df) {
  
  df <- df %>%
    distinct(ada_name) %>%
    mutate(ada_ID = sample(n())) %>%
    arrange(ada_ID) %>%
    mutate(ada_ID = row_number())
  
  return(df)
}

## Collect Results
# function to collect all benchmarking results in a folder
# collects main results (default) or robustness results (robustness == TRUE)
collectResults <- function(robustness = FALSE){
  
  if(robustness == TRUE){
    setwd("/srv/data/penn/what_makes_an_effective_prosecutor/benchmark_res_robustness/")
  } else {
    setwd("/srv/data/penn/what_makes_an_effective_prosecutor/benchmark_res_hearing_team/")
  }
  
  filenames <- list.files()
  results <- list()
  
  for(file in filenames){
    
    a <- readRDS(file) 
    
    # recover ADA's name
    name <- gsub(".rds","",file) %>%
      gsub("_"," ", .)
    
    # identify which covariate had max KS
    max_ks_var <- names(which(abs(a$balance.tab[,3]) == a$ks))
    
    # pull out z-scores
    labels <- lapply(names(a$effects), function(x) paste0(x, "_z")) 
    z <- data.frame(matrix(ncol = length(labels), nrow = 0))
    names(z) <- labels
    z[1,] <- a$z
    
    # pull DR results, add column with ADA's name
    a <- a$effects %>%
      as.data.frame(.) %>%
      rownames_to_column(.) %>%
      filter(rowname == "dr") %>%
      mutate(ADA = name) %>%
      mutate(ks = a$ks,
             ks_var = max_ks_var[1], # just keep the first one for now--fix later
             n = a$n1,
             n_start = a$n1.start,
             n_end = a$n1.end,
             ess = a$ESS) %>%
      # add on columns with z scores
      cbind(., z)
    
    # append to main DF of results
    results <- append(results, list(a))
  }
  
  results <- bind_rows(results) %>%
    select(ADA, n, n_start, n_end, ess, ks, ks_var, everything()) %>%
    # rescale all z-scores to mean 0 sd 1
    mutate(across(ends_with("_z"), 
                  ~ (. - mean(., na.rm = TRUE)) / sd(., na.rm = TRUE), 
                  .names = "{.col}_norm"))
  
  return(results)
}


## Get Voter Info for ADAs
# takes dataframe of ADAs we have benchmarking results for and merges on
# voting data from Philadelphia county. Returns original df with voter info added.
# This function also includes some results from hand matching.
getVoterData <- function(df){
  
  dict <- read_excel("/srv/data/penn/what_makes_an_effective_prosecutor/voter_files/PA_FVE_datadictionary.xlsx")
  fve <- read.delim("/srv/data/penn/what_makes_an_effective_prosecutor/voter_files/PHILADELPHIA FVE 20210419.txt", header = FALSE)
  names(fve) <- dict$FieldName 
  
  # make name variations to match on in voter file
  fve_simp <- fve %>%
    select(IDNumber, FirstName, MiddleName, LastName, Gender, DOB, PartyCode) %>%
    mutate(full_name = str_squish(str_to_lower(paste(FirstName,MiddleName,LastName)))) %>%
    mutate(first_middle_last = str_squish(str_to_lower(paste(FirstName,MiddleName,LastName)))) %>%
    mutate(first_last = str_squish(str_to_lower(paste(FirstName,LastName)))) %>%
    mutate(MI = substr(MiddleName, 1, 1)) %>%
    mutate(first_mi_last = str_squish(str_to_lower(paste(FirstName,MI,LastName)))) %>%
    select(-MI) %>%
    mutate(first_middle_last = ifelse(first_middle_last == first_mi_last, "",first_middle_last)) %>%
    mutate(first_mi_last = ifelse(first_mi_last == first_last, "",first_mi_last)) %>%
    pivot_longer(., cols = c(first_middle_last, first_mi_last, first_last), 
                 names_to = "name_variation",
                 values_to = "name")
  
  # get list of unique ADAs we need to match & make name variations
  adas <- df %>%
    select(ada_ID, ADA) %>%
    mutate(name = gsub("'", "", ADA)) %>%
    mutate(name = gsub("-", " ", name)) %>%
    mutate(FirstName = str_extract(name, "^[a-zA-Z]+"),
           MiddleName = ifelse(grepl("[a-zA-Z]+ [a-zA-Z]+ [a-zA-Z]+", name), str_extract(name, " [a-zA-Z]+ "), ""),
           LastName = str_extract(name, " [a-zA-Z]+$")) %>%
    mutate(first_middle_last = str_squish(str_to_lower(paste(FirstName,MiddleName,LastName)))) %>%
    mutate(first_last = str_squish(str_to_lower(paste(FirstName,LastName)))) %>%
    mutate(MiddleName = str_trim(MiddleName)) %>%
    mutate(MI = substr(MiddleName, 1, 1)) %>%
    mutate(first_mi_last = str_squish(str_to_lower(paste(FirstName,MI,LastName)))) %>%
    select(-MI) %>%
    mutate(first_middle_last = ifelse(first_middle_last == first_mi_last, "",first_middle_last)) %>%
    mutate(first_mi_last = ifelse(first_mi_last == first_last, "",first_mi_last)) %>%
    select(ada_ID, ADA, first_middle_last, first_mi_last, first_last) %>%
    pivot_longer(., cols = c(first_middle_last, first_mi_last, first_last), 
                 names_to = "name_variation",
                 values_to = "name") 
  
  # Match ADAs to Voter records
  adas1 <- adas %>%
    filter(name_variation == "first_middle_last") %>%
    filter(name != "") %>%
    left_join(., fve_simp, by = c("name","name_variation")) %>%
    filter(!is.na(IDNumber))
  
  adas2 <- adas %>%
    filter(name_variation == "first_mi_last") %>%
    # drop ADAs we already matched on first_middle_last
    filter(name != "" & !(ADA %in% adas1$ADA)) %>%
    left_join(., fve_simp, by = c("name","name_variation")) %>%
    filter(!is.na(IDNumber))
  
  adas3 <- adas %>%
    filter(name_variation == "first_last") %>%
    # drop ADAs we already matched on first_middle_last or first_mi_last
    filter(name != "" & !(ADA %in% adas1$ADA) & !(ADA %in% adas2$ADA)) %>%
    left_join(., fve_simp, by = c("name","name_variation"))
  
  adas_merged <- adas1 %>%
    rbind(., adas2) %>%
    rbind(., adas3)
  
  # pull out 1-1 matches
  matches <- adas_merged %>%
    mutate(n_matches = n(), .by = ADA) %>%
    filter(n_matches == 1 & !is.na(IDNumber))
  
  no_match <- adas_merged %>%
    mutate(n_matches = n(), .by = ADA) %>%
    filter(n_matches == 1 & is.na(IDNumber))
  
  many_match <- adas_merged %>%
    mutate(n_matches = n(), .by = ADA) %>%
    filter(n_matches > 1)
  
  hand_matches <- read_csv("/srv/data/penn/what_makes_an_effective_prosecutor/voter_hand_matches.csv")
  
  matches <- matches %>%
    rbind(., hand_matches) %>%
    mutate(DOB = mdy(DOB)) %>%
    mutate(pol_party = case_when(PartyCode == "D" ~ "Democrat",
                                 PartyCode == "R" ~ "Republican",
                                 PartyCode == "NF" | PartyCode == "NON" ~ "No Affiliation",
                                 TRUE ~ NA))
  
  # merge voter data onto results dataframe
  df <- df %>%
    left_join(., matches) %>%
    select(-c(name_variation, name, FirstName, MiddleName, LastName, full_name, n_matches)) %>%
    rename(voter_ID = IDNumber) %>%
    select(ada_ID, ADA, DOB, Gender, PartyCode, pol_party, everything())
  
  return(df)
}


## Get Adjusted Z-scores
# Generate adjusted_z-values based on FDR generated p-values (q-values) using BY approach 
# Assumes arbitrary correlation in test statistics (ADA z-scores).  
getAdjustedZs <- function(results_team){
  
  # Get columns of p-values
  p_columns <- grep("\\.p$", names(results_team), value = TRUE)
  
  # Iterate over each column of p-values
  for(col in p_columns){
    new_col_prefix <- sub("\\.p$", "", col)  # Remove ".p" from the column name
    
    results_team <- results_team %>%
      # get q-value using BY method
      mutate(!!paste0(new_col_prefix, ".q") := p.adjust(!!sym(col), method = "BY")) %>%
      # get adjusted z-score
      mutate(!!paste0(new_col_prefix, ".adj_z") := qnorm(!!sym(paste0(new_col_prefix, ".q")))) %>%
      # get correct sign for z-score based on original value
      mutate(!!paste0(new_col_prefix, ".adj_z") := ifelse(!!sym(paste0(new_col_prefix, "_z")) < 0, 
                                                          -abs(!!sym(paste0(new_col_prefix, ".adj_z"))), 
                                                          abs(!!sym(paste0(new_col_prefix, ".adj_z"))))) %>%
      # set Inf z-scores to zero (no effect)
      mutate(!!paste0(new_col_prefix, ".adj_z") := ifelse(!!sym(paste0(new_col_prefix, ".adj_z")) == Inf | !!sym(paste0(new_col_prefix, ".adj_z")) == -Inf, 
                                                          0, !!sym(paste0(new_col_prefix, ".adj_z"))))
  }
  
  return(results_team)
}

#-------------------------------------------------------------------------------#
# Load / finalize dataset ------------------------------------------------------#
#-------------------------------------------------------------------------------#

# Get hearing-level dataset with teams of all MC ADAs to touch the case
df_team <- readRDS("/srv/data/penn/what_makes_an_effective_prosecutor/final_benchmark_df.rds") %>%
  # Keep only listings in MC room
  filter(court_room_num %in% c("403","406","503","506","603","606","703","706","803","806","903","906","1103")) %>%
  # keep hearings with at least one ADA identified 
  filter(!is.na(ada_name)) %>%
  # keep hearings for ADAs with caseloads > 50
  mutate(caseload = n(), .by = ada_name) %>%
  filter(caseload >= 50) %>%
  # drop levels of factor ADA names that don't exist after filtering
  droplevels(.) %>%
  # create anonymized ADA ID numbers, then join back to main dataset
  left_join(., getADAIDs(.), by = "ada_name") %>%
  # need key for unique docket-listing-adas 
  mutate(docket_listing_ada = paste(docket_number, listing_number, ada_number)) %>%
  # drop if outcomes are missing (drops one case) 
  filter(!is.na(recid_win_2yr_fromOGopen)) %>%
  mutate(convicted_acquitted = ifelse(conviction == 1 | acquitted == 1, 1, 0)) %>%
  mutate(dismissed = ifelse(dispo_type_update %in% c("404 Withdrawn","Dismissed/Withdrawn/Etc"), 1, 0))

# drop a few hearings with unclear / conflicting court and judge information (need distinct ADA-docket-listings)
i <- which(duplicated(df_team$docket_listing_ada))
a <- df_team[i,]
df_team <- df_team %>% 
  filter(!docket_listing_ada %in% a$docket_listing_ada)
rm(a,i)

#-------------------------------------------------------------------------------#
# Main Benchmarking Results ----------------------------------------------------#
#-------------------------------------------------------------------------------#

# NOTES:
# Uses all ADAs at any hearing (ADA teams). 
# Drops any other hearings on ADA i's cases from ever being used in their benchmark.
# Uses a while loop to drop cases that are rare / can't be matched until either the
# model converges with max_ks < 0.05 or the ADA's caseload drops below 50 cases.

DR.INPUT <- list(
  y.form      =~cw_fta_athearing + return_fta + disco_marked_closed + disco_incomplete +
    cw_notready_combo + marked_mbt + case_length_days + conviction + acquitted + convicted_acquitted +
    dismissed + punished + convicted_no_sentence + rearrest_win_2yr_fromOGopen + recid_win_2yr_fromOGopen,
  t.form      =~treat,
  x.form      =~def_age_arrest + male + black + hisp + lead_charge_offense_code_with_subsection + 
    violent + drug + property + dui + public_order + is_dv + number_of_charges + lead_charge_grade_atCU_levels +
    has_pic_dao + has_conspiracy_dao + has_vufa_dao + has_resisting_dao +
    prior_past_yr + pending_case_at_open + prior_misd_conviction + prior_fel_conviction +
    prior_violent_conviction + prior_incarceration + number_of_prior_arrests + on_probation +
    bail_posted_in_3days + def_atty_pd + judge + year + dc_district + listing_number, 
  key.form    =~docket_listing_ada)

adas_to_loop <- as.character(unique(df_team$ada_name))
counter <- 1
total_adas <- length(adas_to_loop)

# Adding check for if the run was already going & got timed out--
# see which ADAs have already been run & saved
saved_files <- list.files("/srv/data/penn/what_makes_an_effective_prosecutor/benchmark_res_hearing_team/", pattern = "\\.rds$", full.names = FALSE)
adas_done <- gsub("\\.rds$", "", saved_files)
adas_done <- gsub("_", " ", adas_done)
adas_to_loop <- setdiff(adas_to_loop, adas_done)
counter <- length(adas_done)


for(a in adas_to_loop){
  
  cat("Running ADA",a,"\n")
  cat("ADA number",counter,"out of",total_adas,"\n")
  
  # label ADA i's cases as "treat"
  df_team$treat <- ifelse(df_team$ada_name == a, 1, 0)
  
  # drop rows for teammates who ever handled the same case
  temp <- df_team %>%
    mutate(teammate = ifelse(any(treat == 1), 1, 0), .by = docket_number) %>%
    mutate(drop = ifelse(treat == 0 & teammate == 1, 1, 0)) %>%
    filter(drop == 0) 
  
  # keep track of how many treatment cases we start with
  n1.start      <- sum(temp$treat)
  shrinkage     <- 0.001
  dropped.cases <- NULL
  converged     <- FALSE
  
  tryCatch({
    # while not converged, keep fitting PS model again and again as long as we have at least 50 cases
    while(!converged){
      if(sum(temp$treat) < 50)
        stop("Insufficient number of cases (treated cases < 50)")
      
      dr2 <- fastDR(form.list = DR.INPUT,
                    data = temp,
                    y.dist = "gaussian",
                    n.trees = 3000,
                    interaction.depth = 3,
                    shrinkage = shrinkage[1], 
                    verbose = TRUE, 
                    ps.only = FALSE, 
                    keepGLM = FALSE, 
                    smooth.lm = 0)
      
      shrinkage[1] <- dr2$shrinkage
      
      if(length(dropped.cases)==0){
        balance.tab.un0 <- dr2$balance.tab.un
      }
      
      # must be set to 0, dropped control cases will not be in dr2$w
      temp$w <- temp$p <- 0
      i <- match(names(dr2$w),temp$docket_listing_ada)
      temp$w[i] <- dr2$w
      temp$p[i] <- dr2$p
      converged <- (dr2$ks <= 0.05)
      
      # if converged, will exit while loop. Otherwise, drop some cases and rerun
      if(!converged){
        # drop hearings if their propensity score is above 99th quantile of control PS
        p.cutoff <- with(temp, quantile(p[treat==0],0.99))
        i <- with(temp, (treat==0) | (treat==1 & p<=p.cutoff))
        dropped.cases <- c(dropped.cases, temp$docket_listing_ada[!i])
        cat("Dropping a total of",length(dropped.cases),"cases\n")
        temp <- temp[i,]
        temp$w <- NULL
        temp$p <- NULL 
      }
    }
  }, error = function(e) {
    cat("Error:", conditionMessage(e), "\n")
    cat("Skipping ADA", a, "\n")
  })
  
  # collect final results
  dr2$balance.tab.un0 <- balance.tab.un0
  dr2$n1.start <- n1.start
  dr2$n1.end <- sum(temp$treat)
  dr2$dropped.cases <- dropped.cases
  
  # save results
  ada_name <- gsub(" ","_",a)
  saveRDS(dr2, file = paste0("/srv/data/penn/what_makes_an_effective_prosecutor/benchmark_res_hearing_team/",ada_name,".rds"))
  
  cat("Finished ADA",a,"\n")
  counter <- counter + 1
}



#-------------------------------------------------------------------------------#
# Robustness: Benchmarking for Pre-Krasner ADAs and Cases ----------------------#
#-------------------------------------------------------------------------------#

df_robust <- df_team %>%
  filter(disposition_date < "2018-01-01") 

DR.INPUT <- list(
  y.form      =~cw_fta_athearing + return_fta + disco_marked_closed + disco_incomplete +
    cw_notready_combo + marked_mbt + case_length_days + conviction + acquitted + convicted_acquitted +
    dismissed + punished + convicted_no_sentence + rearrest_win_2yr_fromOGopen + recid_win_2yr_fromOGopen,
  t.form      =~treat,
  x.form      =~def_age_arrest + male + black + hisp + lead_charge_offense_code_with_subsection + 
    violent + drug + property + dui + public_order + is_dv + number_of_charges + lead_charge_grade_atCU_levels +
    has_pic_dao + has_conspiracy_dao + has_vufa_dao + has_resisting_dao +
    prior_past_yr + pending_case_at_open + prior_misd_conviction + prior_fel_conviction +
    prior_violent_conviction + prior_incarceration + number_of_prior_arrests + on_probation +
    bail_posted_in_3days + def_atty_pd + judge + year + dc_district + listing_number, 
  key.form    =~docket_listing_ada)

adas_to_loop <- as.character(unique(df_robust$ada_name))
counter <- 1
total_adas <- length(adas_to_loop)

# Adding check for if the run was already going & got timed out--
# see which ADAs have already been run & saved
saved_files <- list.files("/srv/data/penn/what_makes_an_effective_prosecutor/benchmark_res_robustness/", pattern = "\\.rds$", full.names = FALSE)
adas_done <- gsub("\\.rds$", "", saved_files)
adas_done <- gsub("_", " ", adas_done)
adas_to_loop <- setdiff(adas_to_loop, adas_done)
counter <- length(adas_done)


for(a in adas_to_loop){
  
  cat("Running ADA",a,"\n")
  cat("ADA number",counter,"out of",total_adas,"\n")
  
  # label ADA i's cases as "treat"
  df_robust$treat <- ifelse(df_robust$ada_name == a, 1, 0)
  
  # drop rows for teammates who ever handled the same case
  temp <- df_robust %>%
    mutate(teammate = ifelse(any(treat == 1), 1, 0), .by = docket_number) %>%
    mutate(drop = ifelse(treat == 0 & teammate == 1, 1, 0)) %>%
    filter(drop == 0) 
  
  # keep track of how many treatment cases I start with
  n1.start      <- sum(temp$treat)
  shrinkage     <- 0.001
  dropped.cases <- NULL
  converged     <- FALSE
  
  tryCatch({
    # while not converged, keep fitting PS model again and again as long as we have at least 50 cases
    while(!converged){
      if(sum(temp$treat) < 50)
        stop("Insufficient number of cases (treated cases < 50)")
      
      dr2 <- fastDR(form.list = DR.INPUT,
                    data = temp,
                    y.dist = "gaussian",
                    n.trees = 3000,
                    interaction.depth = 3,
                    shrinkage = shrinkage[1], 
                    verbose = TRUE, 
                    ps.only = FALSE, 
                    keepGLM = FALSE, 
                    smooth.lm = 0)
      
      shrinkage[1] <- dr2$shrinkage
      
      if(length(dropped.cases)==0){
        balance.tab.un0 <- dr2$balance.tab.un
      }
      
      # must be set to 0, dropped control cases will not be in dr2$w
      temp$w <- temp$p <- 0
      i <- match(names(dr2$w),temp$docket_listing_ada)
      temp$w[i] <- dr2$w
      temp$p[i] <- dr2$p
      converged <- (dr2$ks <= 0.05)
      
      # if converged, will exit while loop. Otherwise, drop some cases and rerun
      if(!converged){
        # drop hearings if their propensity score is above 99th quantile of control PS
        p.cutoff <- with(temp, quantile(p[treat==0],0.99))
        i <- with(temp, (treat==0) | (treat==1 & p<=p.cutoff))
        dropped.cases <- c(dropped.cases, temp$docket_listing_ada[!i])
        cat("Dropping a total of",length(dropped.cases),"cases\n")
        temp <- temp[i,]
        temp$w <- NULL
        temp$p <- NULL 
      }
    }
  }, error = function(e) {
    cat("Error:", conditionMessage(e), "\n")
    cat("Skipping ADA", a, "\n")
  })
  
  # collect final results
  dr2$balance.tab.un0 <- balance.tab.un0
  dr2$n1.start <- n1.start
  dr2$n1.end <- sum(temp$treat)
  dr2$dropped.cases <- dropped.cases
  
  # save results
  # note that this will save the results that didn't converge--will sort through later
  ada_name <- gsub(" ","_",a)
  saveRDS(dr2, file = paste0("/srv/data/penn/what_makes_an_effective_prosecutor/benchmark_res_robustness/",ada_name,".rds"))
  
  cat("Finished ADA",a,"\n")
  counter <- counter + 1
}



#-------------------------------------------------------------------------------#
# Pull together Benchmarking results -------------------------------------------#
#-------------------------------------------------------------------------------#

results_team <- collectResults() %>%
  # get subset of ADAs who we were able to match well (ks < 0.06)
  filter(ks <= 0.06) %>%
  # pull in ada ID numbers from main dataset
  left_join(., df_team %>% select(ada_name, ada_ID) %>% distinct(.), by = c("ADA" = "ada_name")) %>%
  select(ada_ID, ADA, everything()) %>%
  # merge voter data onto results dataframe
  getVoterData(.) %>%
  getAdjustedZs(.)

# Write results to .csv
write_csv(results_team, "/srv/data/penn/what_makes_an_effective_prosecutor/results_team.csv")

results_robust <- collectResults(robustness = T) %>%
  # get subset of ADAs who we were able to match well (ks < 0.06)
  filter(ks <= 0.06) %>%
  # pull in ada ID numbers from main dataset
  left_join(., df_team %>% select(ada_name, ada_ID) %>% distinct(.), by = c("ADA" = "ada_name")) %>%
  select(ada_ID, ADA, everything()) %>%
  # merge voter data onto results dataframe
  getVoterData(.) %>%
  getAdjustedZs(.)

write_csv(results_robust, "/srv/data/penn/what_makes_an_effective_prosecutor/results_robust.csv")
